library(dplyr)
library(tidyr)
library(readr)
library(ggplot2)
library(lattice)
library(naniar)
library(skimr)

Introduction

Data Preparation

Read In

# Locate the CSV inputs. The two reads below pick files by their position in
# this listing, so they depend on the order `dir_ls()` returns them in.
input_dir <- fs::path("../input")
files <- fs::dir_ls(path = input_dir, glob = "*.csv")

# First listed CSV, read with explicit column types.
his_dt <- readr::read_csv(
  file = files[1],
  col_types = readr::cols(
    package = readr::col_character(),
    version = readr::col_character(),
    date = readr::col_date(format = "%Y-%m-%d"),
    repository = readr::col_character()
  )
)

# Second listed CSV, read with explicit column types.
ov_dt <- readr::read_csv(
  file = files[2],
  col_types = readr::cols(
    package = readr::col_character(),
    version = readr::col_character(),
    depends = readr::col_character(),
    imports = readr::col_character(),
    license = readr::col_character(),
    needs_compilation = readr::col_logical(),
    author = readr::col_character(),
    bug_reports = readr::col_character(),
    url = readr::col_character(),
    date_published = readr::col_date(format = "%Y-%m-%d"),
    description = readr::col_character(),
    title = readr::col_character()
  )
)

Quick View

# Column-wise preview of the overview table, printed 100 characters wide.
dplyr::glimpse(ov_dt, width = 100)
Rows: 18,388
Columns: 12
$ package           <chr> "A3", "AATtools", "ABACUS", "abbreviate", "abbyyR", "abc", "abc.data", "…
$ version           <chr> "1.0.0", "0.0.1", "1.0.0", "0.1", "0.5.5", "2.2.1", "1.0", "0.9.0", "1.0…
$ depends           <chr> "R (>= 2.15.0), xtable, pbapply", "R (>= 3.6.0)", "R (>= 3.1.0)", NA, "R…
$ imports           <chr> NA, "magrittr, dplyr, doParallel, foreach", "ggplot2 (>= 3.1.0), shiny (…
$ license           <chr> "GPL (>= 2)", "GPL-3", "GPL-3", "GPL-3", "MIT + file LICENSE", "GPL (>= …
$ needs_compilation <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, TRUE, FALSE, TRU…
$ author            <chr> "Scott Fortmann-Roe", "Sercan Kahveci [aut, cre]", "Mintu Nath [aut, cre…
$ bug_reports       <chr> NA, "https://github.com/Spiritspeak/AATtools/issues", NA, NA, "http://gi…
$ url               <chr> NA, NA, "https://shiny.abdn.ac.uk/Stats/apps/", "https://github.com/sigb…
$ date_published    <date> 2015-08-16, 2020-06-14, 2019-09-20, 2021-12-14, 2019-06-25, 2022-05-19,…
$ description       <chr> "Supplies tools for tabulating and analyzing the results of predictive m…
$ title             <chr> "Accurate, Adaptable, and Accessible Error Metrics for Predictive\nModel…

Data Quality

# Missingness map of every column, with rows ordered by publication date.
naniar::vis_miss(dplyr::arrange(ov_dt, date_published))

Questions

Features

Separate version

# Split `version` on "." into up to three pieces and keep the original column.
# Only dots are separators. Anything past the second dot stays attached to
# `patch` (extra = "merge"), and shorter versions are padded with NA on the
# right (fill = "right").
ov_dt <- tidyr::separate(
  data = ov_dt,
  col = version,
  into = c("major", "minor", "patch"),
  sep = "\\.",
  remove = FALSE,
  extra = "merge",
  fill = "right"
)

Number of dependencies

# Number of comma-separated entries in `depends`; a missing field counts as 0.
# `str_count()` is vectorised, so no row-wise map is needed: the number of
# pieces is the number of commas plus one. `if_else()` with `0L` keeps the
# column integer, where base `ifelse(..., 0, ...)` promoted it to double.
# NOTE(review): an "R (>= x.y.z)" entry is counted like any other item.
ov_dt <- ov_dt |>
  mutate(
    num_dep = if_else(
      is.na(depends),
      0L,
      stringr::str_count(depends, ",") + 1L
    )
  )

Number of imports

# Number of comma-separated entries in `imports`; a missing field counts as 0.
# `str_count()` is vectorised, so no row-wise map is needed: the number of
# pieces is the number of commas plus one. `if_else()` with `0L` keeps the
# column integer, where base `ifelse(..., 0, ...)` promoted it to double.
ov_dt <- ov_dt |>
  mutate(
    num_imports = if_else(
      is.na(imports),
      0L,
      stringr::str_count(imports, ",") + 1L
    )
  )

Number of authors

# Rough author count: number of comma-separated pieces in `author`.
# Vectorised as commas + 1. A missing `author` now stays NA; the previous
# split-and-length version reported it as 1, because splitting NA still
# yields a single piece.
# NOTE(review): commas inside one author's entry are counted as separators.
ov_dt <- ov_dt |>
  mutate(
    num_authors = stringr::str_count(author, ",") + 1L
  )

Temporal features

# Calendar features derived from `date_published`.
# `dt` is the first day of the publication month. It is computed by flooring
# the date rather than pasting "year-MonthLabel" and re-parsing it: the string
# round trip depended on month labels being parseable and emitted a
# "failed to parse" warning (presumably for a missing date). A missing
# `date_published` now simply gives a missing `dt`.
ov_dt <- ov_dt |>
  mutate(
    year = lubridate::year(date_published),
    month = lubridate::month(date_published, label = TRUE),
    day = lubridate::day(date_published),
    wday = lubridate::wday(date_published, label = TRUE),
    yr_mon = sprintf("%d-%s", year, month),
    dt = lubridate::floor_date(date_published, unit = "month")
  )

Title & Description Lengths

# Word counts of the title and description: each maximal run of word
# characters (`\w+`) counts as one word.
word_pattern_count <- function(text) stringr::str_count(text, "\\w+")
ov_dt <- mutate(
  ov_dt,
  len_title = word_pattern_count(title),
  len_desc = word_pattern_count(description)
)
rm(word_pattern_count)

License

# Collapse the free-text `license` field into a small set of license families.
# `case_when()` uses the first rule that matches, so the order below matters:
# the anchored GPL-3 / GPL-2 / AGPL / LGPL patterns are tried before the
# broader substring rules. Rows where no rule is TRUE (including a missing
# `license`) fall through to "Other".
ov_dt <- ov_dt |> 
  mutate(
    license_cleaned = case_when(
      stringr::str_detect(license, "^GPL-3") ~ "GPL-3",
      # "GPL (<comparison> 3...)" form: spaces, digits, dots and comparison
      # signs may sit between the opening parenthesis and the 3.
      stringr::str_detect(license, "^GPL\\s\\([\\s\\d\\.<=>]*3") ~ "GPL-3",
      stringr::str_detect(license, "^GPL-2") ~ "GPL-2",
      # Same parenthesised form for version 2.
      stringr::str_detect(license, "^GPL\\s\\([\\s\\d\\.<=>]*2") ~ "GPL-2",
      stringr::str_detect(license, "^AGPL") ~ "AGPL",
      stringr::str_detect(license, "^LGPL") ~ "LGPL",
      # From here on the patterns match anywhere in the string.
      stringr::str_detect(license, "Apache") ~ "Apache",
      stringr::str_detect(license, "BSD") ~ "BSD",
      stringr::str_detect(license, "LGPL") ~ "LGPL",
      # stringr::str_detect(license, "GNU") ~ "GNU",
      stringr::str_detect(license, "MIT") ~ "MIT",
      stringr::str_detect(license, "CC0") ~ "CC0",
      # stringr::str_detect(license, "MPL") ~ "MPL",
      # stringr::str_detect(license, "Unlimited") ~ "Unlimited",
      # stringr::str_detect(license, "^CC") ~ "CC",
      # Exact match only: a bare "GPL" with no version.
      license == "GPL" ~ "GPL",
      TRUE ~ "Other"
      )
  )

Bug Report Domain

Temporal Questions

# Monthly mean number of Depends and Imports entries per package.
# `across()` replaces the superseded `summarise_at()`; it yields the same
# `<column>_mean` names that the layers below refer to.
ov_dt |>
  group_by(dt) |>
  summarise(across(c(num_dep, num_imports), list(mean = mean))) |>
  ggplot(aes(x = dt)) +
  geom_jitter(aes(y = num_dep_mean, color = "num_dep_mean"), alpha = 0.2) +
  geom_smooth(
    aes(y = num_dep_mean, color = "num_dep_mean"),
    span = 0.3,
    se = FALSE
  ) +
  geom_jitter(
    aes(y = num_imports_mean, color = "num_imports_mean"),
    alpha = 0.2
  ) +
  geom_smooth(
    aes(y = num_imports_mean, color = "num_imports_mean"),
    span = 0.3,
    se = FALSE
  ) +
  theme_light()

# Monthly median word count of titles and descriptions.
# `across()` replaces the superseded `summarise_at()` and keeps the
# `<column>_median` names used below. The per-month standard deviations were
# computed but never plotted, so they are no longer calculated.
ov_dt |>
  group_by(dt) |>
  summarise(
    across(
      c(len_title, len_desc),
      list(median = \(x) median(x, na.rm = TRUE))
    )
  ) |>
  ggplot(aes(x = dt)) +
  geom_jitter(
    aes(y = len_title_median, color = "len_title_median"),
    alpha = 0.2
  ) +
  geom_smooth(
    aes(y = len_title_median, color = "len_title_median"),
    span = 0.3,
    se = FALSE
  ) +
  geom_jitter(
    aes(y = len_desc_median, color = "len_desc_median"),
    alpha = 0.2
  ) +
  geom_smooth(
    aes(y = len_desc_median, color = "len_desc_median"),
    span = 0.3,
    se = FALSE
  ) +
  theme_light()

# Title word count of every package against its publication date, on a log10
# y axis, with a smoothed trend line on top of the jittered points.
ggplot(data = ov_dt, mapping = aes(x = date_published, y = len_title)) +
  geom_jitter(alpha = 0.05) +
  geom_smooth(span = 0.1, se = FALSE) +
  scale_y_log10() +
  theme_light()

# Same view for the description word count (wider smoothing span).
ggplot(data = ov_dt, mapping = aes(x = date_published, y = len_desc)) +
  geom_jitter(alpha = 0.05) +
  geom_smooth(span = 0.2, se = FALSE) +
  scale_y_log10() +
  theme_light()

# Number of packages per cleaned license family, as horizontal bars ordered by
# count. The fill legend only repeats the axis labels, so it is hidden;
# `guides(fill = "none")` is the current spelling of the deprecated
# `guides(fill = FALSE)`.
ov_dt |>
  count(license_cleaned) |>
  ggplot(
    aes(
      x = forcats::fct_reorder(license_cleaned, n),
      y = n,
      fill = license_cleaned
    )
  ) +
  geom_col() +
  coord_flip() +
  theme_minimal() +
  guides(fill = "none") +
  labs(x = "", y = "")

# Packages published per month, split by cleaned license family.
# Counting with `count(dt, license_cleaned)` returns an ungrouped table, so
# `fct_reorder()` orders the license levels once, by their total count.
# Previously the table was still grouped by `dt` and the reordering ran
# separately inside every month.
ov_dt |>
  count(dt, license_cleaned) |>
  mutate(
    license_cleaned = forcats::fct_reorder(license_cleaned, n, .fun = sum)
  ) |>
  ggplot(aes(x = dt, y = n, color = license_cleaned)) +
  # geom_line( alpha = 0.3) +
  geom_jitter(alpha = 0.3) +
  geom_smooth(span = 0.3, se = FALSE) +
  theme_light()

# Same counts on a log10 y axis with a smoother trend.
ov_dt |>
  count(dt, license_cleaned) |>
  mutate(
    license_cleaned = forcats::fct_reorder(license_cleaned, n, .fun = sum)
  ) |>
  ggplot(aes(x = dt, y = n, color = license_cleaned)) +
  # geom_line( alpha = 0.3) +
  geom_jitter(alpha = 0.3) +
  geom_smooth(span = 0.8, se = FALSE) +
  theme_light() +
  scale_y_log10()

# Packages published per month, split by whether a project URL is present.
# The flag is `!is.na(url)` so that TRUE really means "has a URL"; it was
# previously computed as `is.na(url)`, which inverted the legend.
ov_dt |>
  count(dt, url_exist = !is.na(url)) |>
  ggplot(aes(x = dt, y = n, color = url_exist)) +
  geom_jitter(alpha = 0.3) +
  geom_smooth(span = 0.3, se = FALSE) +
  theme_light()

# Same split for the bug-report URL, with a flag named after the column it
# describes.
ov_dt |>
  count(dt, bug_reports_exist = !is.na(bug_reports)) |>
  ggplot(aes(x = dt, y = n, color = bug_reports_exist)) +
  geom_jitter(alpha = 0.3) +
  geom_smooth(span = 0.3, se = FALSE) +
  theme_light()

# Number of packages published per month.
ov_dt |>
  count(dt) |>
  ggplot(aes(dt, n)) +
  geom_line()

# Monthly series with months that have no publications filled in as 0, then
# seasonal diagnostics on it. `.date_var` is passed explicitly so
# `pad_by_time()` does not have to guess the date column (and print a
# message), and the result is assigned with `<-` instead of a trailing `->`.
xdat <- ov_dt |>
  filter(!is.na(dt)) |>
  count(dt) |>
  arrange(dt) |>
  timetk::pad_by_time(.date_var = dt, .by = "month", .pad_value = 0)
timetk::plot_seasonal_diagnostics(xdat, dt, n)
---
title: "CRAN History EDA"
author: "R Sangole"
output: html_notebook
---

```{r libraries, message=FALSE, warning=FALSE}
library(dplyr)
library(tidyr)
library(readr)
library(ggplot2)
library(lattice)
library(naniar)
library(skimr)
```

# Introduction



# Data Preparation {.tabset}

## Read In {.tabset}

```{r}
# Locate the CSV inputs. The two reads below pick files by their position in
# this listing, so they depend on the order `dir_ls()` returns them in.
input_dir <- fs::path("../input")
files <- fs::dir_ls(path = input_dir, glob = "*.csv")

# First listed CSV, read with explicit column types.
his_dt <- readr::read_csv(
  file = files[1],
  col_types = readr::cols(
    package = readr::col_character(),
    version = readr::col_character(),
    date = readr::col_date(format = "%Y-%m-%d"),
    repository = readr::col_character()
  )
)

# Second listed CSV, read with explicit column types.
ov_dt <- readr::read_csv(
  file = files[2],
  col_types = readr::cols(
    package = readr::col_character(),
    version = readr::col_character(),
    depends = readr::col_character(),
    imports = readr::col_character(),
    license = readr::col_character(),
    needs_compilation = readr::col_logical(),
    author = readr::col_character(),
    bug_reports = readr::col_character(),
    url = readr::col_character(),
    date_published = readr::col_date(format = "%Y-%m-%d"),
    description = readr::col_character(),
    title = readr::col_character()
  )
)
```

## Quick View {.tabset}

```{r}
# Column-wise preview of the overview table, printed 100 characters wide.
dplyr::glimpse(ov_dt, width = 100)
```



# Data Quality

```{r}
# Missingness map of every column, with rows ordered by publication date.
naniar::vis_miss(dplyr::arrange(ov_dt, date_published))
```

# Questions

* How long did packages take from their first release to version 1.0? 
* What type of packages were most frequent in different years?
* Who are the most productive authors? 
* Can you predict the growth toward 2025?
* What license is most used? Has there been a change over time?
* How many package names are all uppercase, all lowercase, or mixed case?
* How have the dependencies & imports changed over time?
* Which repositories do packages use? Github/Bitbucket etc. How do these vary over time?
* Do packages have URLs for bug reports?
* Are there any temporal patterns in when versions are submitted to CRAN?
* Have titles & descriptions gotten longer over time?
* Do authors use minor versions?

## Features

Separate version

```{r}
# Split `version` on "." into up to three pieces and keep the original column.
# Only dots are separators. Anything past the second dot stays attached to
# `patch` (extra = "merge"), and shorter versions are padded with NA on the
# right (fill = "right").
ov_dt <- tidyr::separate(
  data = ov_dt,
  col = version,
  into = c("major", "minor", "patch"),
  sep = "\\.",
  remove = FALSE,
  extra = "merge",
  fill = "right"
)
```


Number of dependencies

```{r}
# Number of comma-separated entries in `depends`; a missing field counts as 0.
# `str_count()` is vectorised, so no row-wise map is needed: the number of
# pieces is the number of commas plus one. `if_else()` with `0L` keeps the
# column integer, where base `ifelse(..., 0, ...)` promoted it to double.
# NOTE(review): an "R (>= x.y.z)" entry is counted like any other item.
ov_dt <- ov_dt |>
  mutate(
    num_dep = if_else(
      is.na(depends),
      0L,
      stringr::str_count(depends, ",") + 1L
    )
  )
```

Number of imports

```{r}
# Number of comma-separated entries in `imports`; a missing field counts as 0.
# `str_count()` is vectorised, so no row-wise map is needed: the number of
# pieces is the number of commas plus one. `if_else()` with `0L` keeps the
# column integer, where base `ifelse(..., 0, ...)` promoted it to double.
ov_dt <- ov_dt |>
  mutate(
    num_imports = if_else(
      is.na(imports),
      0L,
      stringr::str_count(imports, ",") + 1L
    )
  )
```

Number of authors

```{r}
# Rough author count: number of comma-separated pieces in `author`.
# Vectorised as commas + 1. A missing `author` now stays NA; the previous
# split-and-length version reported it as 1, because splitting NA still
# yields a single piece.
# NOTE(review): commas inside one author's entry are counted as separators.
ov_dt <- ov_dt |>
  mutate(
    num_authors = stringr::str_count(author, ",") + 1L
  )
```

Temporal features
```{r}
# Calendar features derived from `date_published`.
# `dt` is the first day of the publication month. It is computed by flooring
# the date rather than pasting "year-MonthLabel" and re-parsing it: the string
# round trip depends on the month labels being parseable and warns when a
# value cannot be parsed (e.g. a missing date). A missing `date_published`
# now simply gives a missing `dt`.
ov_dt <- ov_dt |>
  mutate(
    year = lubridate::year(date_published),
    month = lubridate::month(date_published, label = TRUE),
    day = lubridate::day(date_published),
    wday = lubridate::wday(date_published, label = TRUE),
    yr_mon = sprintf("%d-%s", year, month),
    dt = lubridate::floor_date(date_published, unit = "month")
  )
```


Title & Description Lengths

```{r}
# Word counts of the title and description: each maximal run of word
# characters (`\w+`) counts as one word.
word_pattern_count <- function(text) stringr::str_count(text, "\\w+")
ov_dt <- mutate(
  ov_dt,
  len_title = word_pattern_count(title),
  len_desc = word_pattern_count(description)
)
rm(word_pattern_count)
```


License 

```{r}
# Collapse the free-text `license` field into a small set of license families.
# `case_when()` uses the first rule that matches, so the order below matters:
# the anchored GPL-3 / GPL-2 / AGPL / LGPL patterns are tried before the
# broader substring rules. Rows where no rule is TRUE (including a missing
# `license`) fall through to "Other".
ov_dt <- ov_dt |> 
  mutate(
    license_cleaned = case_when(
      stringr::str_detect(license, "^GPL-3") ~ "GPL-3",
      # "GPL (<comparison> 3...)" form: spaces, digits, dots and comparison
      # signs may sit between the opening parenthesis and the 3.
      stringr::str_detect(license, "^GPL\\s\\([\\s\\d\\.<=>]*3") ~ "GPL-3",
      stringr::str_detect(license, "^GPL-2") ~ "GPL-2",
      # Same parenthesised form for version 2.
      stringr::str_detect(license, "^GPL\\s\\([\\s\\d\\.<=>]*2") ~ "GPL-2",
      stringr::str_detect(license, "^AGPL") ~ "AGPL",
      stringr::str_detect(license, "^LGPL") ~ "LGPL",
      # From here on the patterns match anywhere in the string.
      stringr::str_detect(license, "Apache") ~ "Apache",
      stringr::str_detect(license, "BSD") ~ "BSD",
      stringr::str_detect(license, "LGPL") ~ "LGPL",
      # stringr::str_detect(license, "GNU") ~ "GNU",
      stringr::str_detect(license, "MIT") ~ "MIT",
      stringr::str_detect(license, "CC0") ~ "CC0",
      # stringr::str_detect(license, "MPL") ~ "MPL",
      # stringr::str_detect(license, "Unlimited") ~ "Unlimited",
      # stringr::str_detect(license, "^CC") ~ "CC",
      # Exact match only: a bare "GPL" with no version.
      license == "GPL" ~ "GPL",
      TRUE ~ "Other"
      )
  )
```

Bug Report Domain

```{r}
# Domain of each bug-report URL, parsed one URL at a time. Packages without a
# bug-report URL get an empty string instead of NA.
ov_dt <- ov_dt |>
  mutate(
    domain = purrr::map_chr(
      bug_reports,
      \(u) if (is.na(u)) "" else urltools::url_parse(u)$domain
    )
  )
```


# Temporal Questions

* How long did packages take from their first release to version 1.0? 
* What type of packages were most frequent in different years?
* Who are the most productive authors? 
* Can you predict the growth toward 2025?
* What license is most used? Has there been a change over time? - done
* How many package names are all uppercase, all lowercase, or mixed case?
* How have the dependencies & imports changed over time?
* Which repositories do packages use? Github/Bitbucket etc. How do these vary over time?
* Do packages have URLs for bug reports?
* Are there any temporal patterns in when versions are submitted to CRAN?
* Do authors use minor versions?
* Have titles & descriptions gotten longer over time? - done


* How have the dependencies & imports changed over time?

```{r}
# Monthly mean number of Depends and Imports entries per package.
# `across()` replaces the superseded `summarise_at()`; it yields the same
# `<column>_mean` names that the layers below refer to.
ov_dt |>
  group_by(dt) |>
  summarise(across(c(num_dep, num_imports), list(mean = mean))) |>
  ggplot(aes(x = dt)) +
  geom_jitter(aes(y = num_dep_mean, color = "num_dep_mean"), alpha = 0.2) +
  geom_smooth(
    aes(y = num_dep_mean, color = "num_dep_mean"),
    span = 0.3,
    se = FALSE
  ) +
  geom_jitter(
    aes(y = num_imports_mean, color = "num_imports_mean"),
    alpha = 0.2
  ) +
  geom_smooth(
    aes(y = num_imports_mean, color = "num_imports_mean"),
    span = 0.3,
    se = FALSE
  ) +
  theme_light()
```

* Have titles & descriptions gotten longer over time? 

```{r}
# Monthly median word count of titles and descriptions.
# `across()` replaces the superseded `summarise_at()` and keeps the
# `<column>_median` names used below. The per-month standard deviations were
# computed but never plotted, so they are no longer calculated.
ov_dt |>
  group_by(dt) |>
  summarise(
    across(
      c(len_title, len_desc),
      list(median = \(x) median(x, na.rm = TRUE))
    )
  ) |>
  ggplot(aes(x = dt)) +
  geom_jitter(
    aes(y = len_title_median, color = "len_title_median"),
    alpha = 0.2
  ) +
  geom_smooth(
    aes(y = len_title_median, color = "len_title_median"),
    span = 0.3,
    se = FALSE
  ) +
  geom_jitter(
    aes(y = len_desc_median, color = "len_desc_median"),
    alpha = 0.2
  ) +
  geom_smooth(
    aes(y = len_desc_median, color = "len_desc_median"),
    span = 0.3,
    se = FALSE
  ) +
  theme_light()
```

```{r}
# Distribution of description word counts for packages published in 2018,
# 2020 and 2022: overlaid densities coloured by year.
ggplot(
  data = dplyr::filter(ov_dt, year %in% c(2022, 2020, 2018)),
  mapping = aes(
    x = len_desc,
    fill = as.factor(year),
    color = as.factor(year)
  )
) +
  geom_density(alpha = 0.3)

# Same three years as histograms, one panel per year.
ggplot(
  data = dplyr::filter(ov_dt, year %in% c(2022, 2020, 2018)),
  mapping = aes(
    x = len_desc,
    fill = as.factor(year),
    color = as.factor(year)
  )
) +
  geom_histogram(alpha = 0.3) +
  facet_wrap(~year)
```



```{r}
# Title word count of every package against its publication date, on a log10
# y axis, with a smoothed trend line on top of the jittered points.
ggplot(data = ov_dt, mapping = aes(x = date_published, y = len_title)) +
  geom_jitter(alpha = 0.05) +
  geom_smooth(span = 0.1, se = FALSE) +
  scale_y_log10() +
  theme_light()

# Same view for the description word count (wider smoothing span).
ggplot(data = ov_dt, mapping = aes(x = date_published, y = len_desc)) +
  geom_jitter(alpha = 0.05) +
  geom_smooth(span = 0.2, se = FALSE) +
  scale_y_log10() +
  theme_light()
```

```{r}

```


* What license is most used? Has there been a change over time?

```{r}
# Number of packages per cleaned license family, as horizontal bars ordered by
# count. The fill legend only repeats the axis labels, so it is hidden;
# `guides(fill = "none")` is the current spelling of the deprecated
# `guides(fill = FALSE)`.
ov_dt |>
  count(license_cleaned) |>
  ggplot(
    aes(
      x = forcats::fct_reorder(license_cleaned, n),
      y = n,
      fill = license_cleaned
    )
  ) +
  geom_col() +
  coord_flip() +
  theme_minimal() +
  guides(fill = "none") +
  labs(x = "", y = "")
```

```{r}
# Packages published per month, split by cleaned license family.
# Counting with `count(dt, license_cleaned)` returns an ungrouped table, so
# `fct_reorder()` orders the license levels once, by their total count.
# Previously the table was still grouped by `dt` and the reordering ran
# separately inside every month.
ov_dt |>
  count(dt, license_cleaned) |>
  mutate(
    license_cleaned = forcats::fct_reorder(license_cleaned, n, .fun = sum)
  ) |>
  ggplot(aes(x = dt, y = n, color = license_cleaned)) +
  # geom_line( alpha = 0.3) +
  geom_jitter(alpha = 0.3) +
  geom_smooth(span = 0.3, se = FALSE) +
  theme_light()

# Same counts on a log10 y axis with a smoother trend.
ov_dt |>
  count(dt, license_cleaned) |>
  mutate(
    license_cleaned = forcats::fct_reorder(license_cleaned, n, .fun = sum)
  ) |>
  ggplot(aes(x = dt, y = n, color = license_cleaned)) +
  # geom_line( alpha = 0.3) +
  geom_jitter(alpha = 0.3) +
  geom_smooth(span = 0.8, se = FALSE) +
  theme_light() +
  scale_y_log10()
```

* Do packages have URLs for bug reports?


```{r}
# Packages published per month, split by whether a project URL is present.
# The flag is `!is.na(url)` so that TRUE really means "has a URL"; it was
# previously computed as `is.na(url)`, which inverted the legend.
ov_dt |>
  count(dt, url_exist = !is.na(url)) |>
  ggplot(aes(x = dt, y = n, color = url_exist)) +
  geom_jitter(alpha = 0.3) +
  geom_smooth(span = 0.3, se = FALSE) +
  theme_light()

# Same split for the bug-report URL, with a flag named after the column it
# describes.
ov_dt |>
  count(dt, bug_reports_exist = !is.na(bug_reports)) |>
  ggplot(aes(x = dt, y = n, color = bug_reports_exist)) +
  geom_jitter(alpha = 0.3) +
  geom_smooth(span = 0.3, se = FALSE) +
  theme_light()
```

* Which repositories do packages use? Github/Bitbucket etc. How do these vary over time?

```{r}
# Bug-report hosts over time: packages published per month for each domain.
# Packages with no bug-report URL (empty domain) are left out, and domains
# seen fewer than 20 times are lumped together before counting.
ov_dt |>
  filter(domain != "") |>
  mutate(domain = forcats::fct_lump_min(domain, min = 20)) |>
  count(dt, domain) |>
  ggplot(mapping = aes(x = dt, y = n, color = domain)) +
  geom_jitter(alpha = 0.3) +
  geom_smooth(span = 0.5, se = FALSE) +
  theme_light()
```

* Are there any temporal patterns in when versions are submitted to CRAN?

```{r}
# Number of packages published per month.
ov_dt |>
  count(dt) |>
  ggplot(aes(dt, n)) +
  geom_line()

# Monthly series with months that have no publications filled in as 0, then
# seasonal diagnostics on it. `.date_var` is passed explicitly so
# `pad_by_time()` does not have to guess the date column (and print a
# message), and the result is assigned with `<-` instead of a trailing `->`.
xdat <- ov_dt |>
  filter(!is.na(dt)) |>
  count(dt) |>
  arrange(dt) |>
  timetk::pad_by_time(.date_var = dt, .by = "month", .pad_value = 0)
timetk::plot_seasonal_diagnostics(xdat, dt, n)
```

